Fri Aug 23 12:42:09 2024

Import the Seurat object: previously processed, annotated with cell labels, assigned gene-set scores using AUCell (cancersea state signature scores), and cells that contain EGFP plasmid fragments have been labeled.

seu <- readRDS(file.path(projDir, 'single_cell_analysis', 'seu.RDS'))
# simplify cell type labels (group rare ones into "other")
seu$celltype <- ifelse(seu$celltype %in% names(which(table(seu$celltype) < 100)), 
                        'other', seu$celltype)
# update group labels 
conditions <- list('PoolA' = 'Recovery', 
               'PoolB' = 'DSS', 
               'PoolC' = 'Untreated')
seu$condition <- paste(conditions[seu$sample])
egfp_cells <- colnames(seu)[seu$egfp_status == 'EGFP_POS']

1 EGFP cells

From the single-cell sequencing of samples under 3 conditions, we managed to pick up about 1.5% of cells to contain at least one unique read originating from the EGFP-plasmid. Those cells are supposed to contain NFKB promoters thus, EGFP-positive cells are likely to contain increased NFKB activity.

knitr::kable(table(seu$egfp_status, seu$condition), 
caption = 'Number of cells per condition categorized by EGFP status')
Number of cells per condition categorized by EGFP status
DSS Recovery Untreated
EGFP_NEG 5257 7153 6561
EGFP_POS 76 116 116

2 UMAPs

Note: Black dots represent cells for which we could pick up at least one EGFP-plasmid fragment.

2.1 celltype

p <- DimPlot(seu, group.by = 'celltype')
df <- p[[1]]$data
p +  geom_point(data = df[egfp_cells,], aes(x = umap_1, y = umap_2), 
             color = 'black', size = 1)

2.2 condition

p <- DimPlot(seu, group.by = 'condition')
df <- p[[1]]$data
p +  geom_point(data = df[egfp_cells,], aes(x = umap_1, y = umap_2), 
             color = 'black', size = 1)

2.3 celltype + condition

p <- DimPlot(seu, group.by = 'celltype', split.by = 'condition')
df <- p[[1]]$data
p +  geom_point(data = df[egfp_cells,], aes(x = umap_1, y = umap_2), 
                color = 'black', size = 1)

3 CancerSEA signatures

Comparison of cancer state signature scores in egfp+ and egfp- cells in different conditions

3.1 By EGFP status

sigs <- colnames(seu@meta.data)[10:23]
dt <- data.table(seu@meta.data)

mdt <- melt.data.table(dt, measure.vars = sigs)

ggplot(mdt, aes(x = value, y = variable)) + 
  geom_density_ridges(aes(fill = egfp_status), alpha = 0.5) +
  scale_fill_manual(values = c('blue', 'red')) + 
  labs(y = 'Signature', x = 'AUCell score')

3.2 By condition

ggplot(mdt, aes(x = value, y = variable)) + 
  geom_density_ridges(aes(fill = condition), alpha = 0.5) +
  labs(y = 'Signature', x = 'AUCell score') 

3.3 By condition + egfp status

ggplot(mdt, aes(x = value, y = variable)) + 
  geom_density_ridges(aes(fill = egfp_status), alpha = 0.5) +
  scale_fill_manual(values = c('blue', 'red')) + 
  labs(y = 'Signature', x = 'AUCell score') +
  facet_grid(~ condition)

4 Marker Analysis - Epithelial Cells

Now, we focus on epithelial cells (cells annotated as epithelial and also are cdh1+) to look for NKFB activity signatures by comparing EGFP+ cells iwth EGFP- cells in different conditions.

4.1 Markers to cross-check with qPCR data

For each gene, we check ratio of epithelial cells expressing the marker in different conditions; split by egfp status

qPCR_markers <- c('Egfp', 'Lgr5', 'Chga', 'Muc2', 'Dclk1', 'Tnfaip3', 'Noxa1', 
                  'Tnf', 'Bcl2l1', 'Bcl2')
M <- GetAssayData(seu)
epithelial_cells <- intersect(colnames(seu)[seu$celltype == 'Epithelial cells'], 
                                      names(which(M['Cdh1',] > 0)))

#  setdiff(intersect(colnames(seu)[seu$celltype == 'Epithelial cells'], 
 #                                     names(which(M['Cdh1',] > 0))),
  #                          names(which(M['Ptprc',] > 0)))
  
seu_epi <- seu[,epithelial_cells]

dt <- data.table(seu_epi@meta.data, keep.rownames = T)
dt <- cbind(dt, sapply(qPCR_markers, function(x) {
  M[x, dt$rn]
}))

df <- do.call(rbind, lapply(qPCR_markers, function(x) {
  s <- dt[,list('percent_expressed' = round(sum(get(x) > 0)/length(get(x)) * 100, 1), 
          'avg_expression' = mean(get(x))),by = c('condition', 'egfp_status')]
  s$gene <- x
  return(s)
}))

df$egfp_status <- factor(df$egfp_status, levels = c('EGFP_POS', 'EGFP_NEG'))
df$condition <- factor(df$condition, levels = c('Untreated', 'DSS', 'Recovery'))
ggplot(df, aes(x = egfp_status, y = percent_expressed)) + 
  geom_bar(stat = 'identity', aes(fill = condition), position = 'dodge') + 
  facet_wrap(~ gene, scales = 'free', nrow = 2) + 
  scale_fill_brewer(type = 'qual', palette = 6)

4.2 Dot Plots of Markers

4.2.1 Top markers

# find markers for egfp+/egpf- in cdh1+ epithelial cells 
markers <- sapply(simplify = F, unique(seu_epi$condition), function(x) {
  message(date(), " => computing markers for condition ",x)
  dt <- data.table(FindMarkers(seu_epi[,seu_epi$condition == x], 
                               min.pct = 0.1, 
                               logFoldChange = 0.25,
                         test.use = 'wilcox', 
                         ident.1 = 'EGFP_POS',
                         ident.2 = 'EGFP_NEG',
                         only.pos = TRUE, 
                         group.by = 'egfp_status'), 
             keep.rownames = T)
  dt$condition <- x
  return(dt)
})
markers <- do.call(rbind, markers) 

m <- markers[pct.1 > 0.15][p_val_adj < 0.1][order(p_val_adj), .SD[1:21], by = .(condition)][!is.na(rn)][rn != 'Egfp']

p <- DotPlot(seu_epi[,seu_epi$egfp_status == 'EGFP_POS'], 
             features = unique(m$rn), 
      group.by = 'condition', scale = F) + coord_flip()

df <- p$data

df$id <- factor(df$id, levels = c('Untreated', 'DSS', 'Recovery'))

ggplot(df, aes(x = features.plot, y = id)) + 
  geom_point(aes(color = avg.exp.scaled, size = pct.exp)) +
  labs(color = "Average\nExpression", size = "Percent\nExpression") + 
#  scale_color_gradient(low = '#672976', high = 'yellow') +  
  scale_color_gradientn(colors = colorRampPalette(c("#313e93", "#b252a2", "#f4a16f", "#eeea59"))(50)) + 
  scale_size(range = c(0, 6)) + 
  theme(axis.title = element_blank(), 
        axis.text.x = element_text(angle = 90, hjust = 1, vjust = 0.5, face = 'italic'), 
        axis.text = element_text(size = 14, family = 'Arial'),
        legend.title = element_text(family = 'Arial'))

4.2.2 Subset of top markers

# subset of genes of interest
# I don't know how these are selected by Marina
goi <- c('Steap4', 'Smim19', 'Cox17', 'Nfkbib', 'Man2a2', 'Sept10', 'Acot13', 'Ndufv2', 'Mrps15', 'Aqp8', 'H2-Eb1', 'Ly6g', 'Atf3')

p <- DotPlot(seu_epi[,seu_epi$egfp_status == 'EGFP_POS'], 
             features = goi, 
      group.by = 'condition', scale = F) + coord_flip()

df <- p$data

df$id <- factor(df$id, levels = c('Untreated', 'DSS', 'Recovery'))

ggplot(df, aes(x = features.plot, y = id)) + 
  geom_point(aes(color = avg.exp.scaled, size = pct.exp)) +
  labs(color = "Average\nExpression", size = "Percent\nExpression") + 
  scale_color_gradientn(colors = colorRampPalette(c("#313e93", "#b252a2", "#f4a16f", "#eeea59"))(50)) +  
  scale_size(range = c(0, 6)) + 
  theme(axis.title = element_blank(), 
        axis.text.x = element_text(angle = 90, hjust = 1, vjust = 0.5, face = 'italic'), 
        axis.text = element_text(size = 14, family = 'Arial'),
        legend.title = element_text(family = 'Arial'))

4.3 Heatmaps

Found top markers expressed in at least 15% of EGFP+ cells and filtered for padj < 0.1

4.3.1 Average Marker Expression

colorPal <- colorRampPalette(c("darkorchid4", "yellow"))(50)
M <- GetAssayData(seu_epi)

#top <- markers[pct.1 > 0.15][p_val_adj < 0.1]

top <- markers[pct.1 > 0.15][p_val_adj < 0.1][order(p_val_adj), .SD[1:20], by = .(condition)][!is.na(rn)]

B <- get_basis_matrix(t(as.matrix(M[unique(top$rn),])), 
                      as.factor(paste(seu_epi$condition, seu_epi$egfp_status)))

sample_order <- c('Untreated EGFP_POS', 'Untreated EGFP_NEG', 
                  'DSS EGFP_POS', 'DSS EGFP_NEG', 
                  'Recovery EGFP_POS', 'Recovery EGFP_NEG')

pheatmap::pheatmap(t(B[sample_order,]), scale = 'row', cluster_cols = F, 
                   cellwidth = 20, fontsize_row = 9, color = colorPal, 
                   gaps_col = c(2,4))

4.3.2 Ratio of cells expressing the marker

# import nfkb target genes; (for marker analysis )
nfkb <- readLines('/data/local/buyar/collaborations/marina_kol/data/nfkb_targets_mouse.txt')
dt <- melt.data.table(markers, id.vars = c('rn', 'condition'), measure.vars = c('pct.1', 'pct.2'))
dt$variable <- ifelse(dt$variable == 'pct.1', 'EGFP_POS', 'EGFP_NEG')
dt$group <- paste(dt$condition, dt$variable)
dtc <- dcast.data.table(dt, rn ~ group, value.var = 'value')
dtc[is.na(dtc)] <- 0
m <- as.matrix(data.frame(dtc[,-1], row.names = dtc[[1]], check.names = F))

genes <- unique(top$rn)
ann_row <- data.frame('condition' = markers[rn %in% genes, .SD[which.min(p_val)],by = rn][match(genes, rn)]$condition, 
                      #'is_nfkb_target' = as.factor(genes %in% nfkb),
                      row.names = genes)

pheatmap::pheatmap(m[genes, sample_order], scale = 'row', 
                   color = colorPal, cellwidth = 20, 
                   main = 'Ratio of cells expressing top markers', 
                   annotation_row = ann_row,
                   gaps_col = c(2, 4),
                   cluster_cols = F, 
                   fontsize = 9, cutree_rows = 3)

5 Revisiting Bulk RNAseq data

We want to apply some things learned from the single-cell data to the bulk RNA-seq data we have analyzed before.

# get RUVSeq normalized counts from bulk rnaseq data 
workdir <- '/data/local/buyar/collaborations/marina_acute_colitis/'
sample_sheet <- read.csv(file.path(workdir, 'data/sample_sheet.csv'))
counts <- read.table(file.path(workdir, '/output_ori/feature_counts/raw_counts/hisat2/counts.tsv'), header = T, check.names = F)
colData <- data.frame(sample_sheet[,-1], row.names = sample_sheet$name)

set <- newSeqExpressionSet(counts = as.matrix(counts),
                           phenoData = colData)
differences <- makeGroups(colData$sample_type)
set_s <- RUVs(set, unique(rownames(set)),
              k=10, differences) #all genes

norm_counts <- log(normCounts(set_s)+1)

5.1 NFKB target genes

Make a heatmap of NFKB target genes (exclusively expressed in Cdh1+ epithelial cells only )

# subset norm_counts for nfkb genes 
ids <- readRDS('/data/local/buyar/datasets/ensmusg2name.RDS')
nfkb_ids <- ids[match(nfkb, name)]$id
M_nfkb <- norm_counts[nfkb_ids,]
# convert to gene names 
rownames(M_nfkb) <- ids[match(rownames(M_nfkb), id)]$name

M <- GetAssayData(seu_epi)
nonepi_cells <- colnames(seu)[seu$celltype != 'Epithelial cells']
M_nonepi <- GetAssayData(seu[,nonepi_cells])
genes <- intersect(rownames(M), rownames(M_nfkb))

# get genes expressed in at least 1% of epithelial cells
genes <- names(which(apply(M[genes,], 1, function(x) (sum(x > 0) / length(x)) * 100) > 0.5))

# remove genes expressed in more than 5% of the non epithelial cells 
genes <- names(which(apply(M_nonepi[genes,], 1, function(x) (sum(x > 0) / length(x)) * 100) < 3))

# keep activated / repressed samples only 
genes <- names(which(rowSums(M_nfkb[genes,]) > 0))

colData$nfkb <- gsub(".+_", "", colData$sample_type)
colData$dss <- gsub("_.+", "", colData$sample_type)

pheatmap::pheatmap(M_nfkb[genes, ], scale = 'row', 
                   annotation_col = colData[,c('nfkb','dss'),drop=F],
                   cluster_cols = F,
                   cellwidth = 15, 
                   color = rev(colorRampPalette(RColorBrewer::brewer.pal(11, "RdYlBu"))(10)), 
                   gaps_col = c(8, 16),
                   cutree_rows = 5, 
                   show_colnames = F)

5.2 GSEA of bulk rnaseq samples

We score bulk rnaseq samples using different gene signatures.

score_gene_sets <- function(exprs, genesets) {
  rankData <- singscore::rankGenes(exprs)
  s <- pbapply::pbsapply(genesets, function(x) {
    x <- intersect(x, rownames(rankData))
    singscore::simpleScore(rankData, upSet = x)[['TotalScore']]
  })
  rownames(s) <- colnames(exprs)
  return(s)
}

find_differential_signatures <- function(scores, samplesA, samplesB) {
  groupA <- intersect(colnames(scores), samplesA)
  groupB <- intersect(colnames(scores), samplesB)

  message("Comparing ",length(groupA), " samples (A) with ",length(groupB), " samples (B)")

  stats <- do.call(rbind, lapply(rownames(scores), function(x) {
    a <- scores[x, groupA]
    b <- scores[x, groupB]
    data.table('signature' = x, 'groupA' = mean(a),
               'groupB' = mean(b),
               'pval' = wilcox.test(a, b)[['p.value']])
  }))
  stats$padj <- p.adjust(stats$pval, method = 'BH')
  stats <- stats[order(pval)]

  return(stats)
}

gex <- data.frame(norm_counts, check.names = F)
# convert to gene names 
ids <- readRDS('/data/local/buyar/datasets/ensmusg2name.RDS')
gex$name <- ids[match(rownames(gex), id)]$name
gex <- gex[!is.na(gex$name),]
gex <- gex[!duplicated(gex$name),]
rownames(gex) <- gex$name
gex$name <- NULL

5.3 EGFP+ markers

For each condition, we found the signature genes that distinguish egfp+ from egfp- cells. Now, we score the bulk samples by these signatures.

m <- markers[pct.1 > 0.15][p_val_adj < 0.1][order(p_val_adj), .SD[1:51], by = .(condition)][rn != 'Egfp']

egfp_signatures <- sapply(simplify = F, unique(m$condition), function(x) m[condition == x]$rn)

egfp_signature_scores <- score_gene_sets(gex, egfp_signatures)
egfp_signature_scores <- egfp_signature_scores[,c('Untreated', 'DSS', 'Recovery')]

pheatmap::pheatmap(t(egfp_signature_scores), cluster_cols = F, 
                   scale = 'row',
                   annotation_col = colData[,c('nfkb','dss'),drop=F],
                   cellwidth = 15,  cellheight = 10,
                   color = colorRampPalette(c("#6298c8", "white", "#ae3d7f"))(10),                    cluster_rows = F, 
                   show_colnames = F, 
                   gaps_col = c(8, 16), 
                   main = 'EGFP+ condition signature scores in bulk rnaseq')

5.4 Immune Cell type markers

xcell <- readRDS('/data/local/buyar/datasets/xCell_signatures.mouse_genenames.RDS')

# immune cell groups from xcell 
immune_cells <- c("B-cells", "CD4+ T-cells", 
                  "CD8+ T-cells", "DC", "Eosinophils", "Macrophages", "Monocytes", 
                  "Mast cells", "Neutrophils", "NK cells")
xcell_scores <- score_gene_sets(gex, xcell[immune_cells])

pheatmap::pheatmap(t(xcell_scores), scale = 'row',
                   annotation_col = colData[,c('nfkb','dss'),drop=F],
                   cellwidth = 15, cellheight = 10,
                   show_colnames = F,
                   cluster_cols = F,
                   gaps_col = c(8, 16), 
                   color = colorRampPalette(c("#6298c8", "white", "#ae3d7f"))(10),
                   main = 'Immune Cell Type Signature Scores in Bulk RNAseq')

6 SessionInfo

print(sessionInfo())
## R version 4.2.3 (2023-03-15)
## Platform: x86_64-pc-linux-gnu (64-bit)
## Running under: Ubuntu 22.04.4 LTS
## 
## Matrix products: default
## BLAS:   /usr/lib/x86_64-linux-gnu/openblas-pthread/libblas.so.3
## LAPACK: /usr/lib/x86_64-linux-gnu/openblas-pthread/libopenblasp-r0.3.20.so
## 
## locale:
##  [1] LC_CTYPE=C.UTF-8       LC_NUMERIC=C           LC_TIME=C.UTF-8       
##  [4] LC_COLLATE=C.UTF-8     LC_MONETARY=C.UTF-8    LC_MESSAGES=C.UTF-8   
##  [7] LC_PAPER=C.UTF-8       LC_NAME=C              LC_ADDRESS=C          
## [10] LC_TELEPHONE=C         LC_MEASUREMENT=C.UTF-8 LC_IDENTIFICATION=C   
## 
## attached base packages:
## [1] stats4    stats     graphics  grDevices utils     datasets  methods  
## [8] base     
## 
## other attached packages:
##  [1] Seurat_5.0.3                SeuratObject_5.0.1         
##  [3] sp_2.1-4                    RUVSeq_1.32.0              
##  [5] edgeR_3.40.2                limma_3.54.2               
##  [7] EDASeq_2.32.0               ShortRead_1.56.1           
##  [9] GenomicAlignments_1.34.1    SummarizedExperiment_1.28.0
## [11] MatrixGenerics_1.10.0       matrixStats_1.1.0          
## [13] Rsamtools_2.14.0            GenomicRanges_1.50.2       
## [15] Biostrings_2.66.0           GenomeInfoDb_1.34.9        
## [17] XVector_0.38.0              IRanges_2.32.0             
## [19] S4Vectors_0.36.2            BiocParallel_1.32.6        
## [21] Biobase_2.58.0              BiocGenerics_0.44.0        
## [23] ggridges_0.5.6              ggrepel_0.9.4              
## [25] stringr_1.5.0               gprofiler2_0.2.2           
## [27] ggpubr_0.6.0                ggplot2_3.4.2              
## [29] pheatmap_1.0.12             data.table_1.14.8          
## [31] openxlsx_4.2.5.2           
## 
## loaded via a namespace (and not attached):
##   [1] utf8_1.2.3             spatstat.explore_3.2-7 reticulate_1.36.1     
##   [4] R.utils_2.12.3         tidyselect_1.2.0       RSQLite_2.3.1         
##   [7] AnnotationDbi_1.60.2   htmlwidgets_1.6.2      grid_4.2.3            
##  [10] Rtsne_0.17             munsell_0.5.0          codetools_0.2-19      
##  [13] ica_1.0-3              interp_1.1-6           future_1.33.2         
##  [16] miniUI_0.1.1.1         withr_3.0.0            spatstat.random_3.2-3 
##  [19] colorspace_2.1-0       progressr_0.14.0       filelock_1.0.3        
##  [22] highr_0.10             knitr_1.43             rstudioapi_0.15.0     
##  [25] ROCR_1.0-11            tensor_1.5             ggsignif_0.6.4        
##  [28] listenv_0.9.1          labeling_0.4.2         GenomeInfoDbData_1.2.9
##  [31] polyclip_1.10-6        hwriter_1.3.2.1        farver_2.1.1          
##  [34] bit64_4.0.5            parallelly_1.37.1      vctrs_0.6.3           
##  [37] generics_0.1.3         xfun_0.39              BiocFileCache_2.6.1   
##  [40] R6_2.5.1               locfit_1.5-9.8         spatstat.utils_3.0-4  
##  [43] bitops_1.0-7           cachem_1.0.8           DelayedArray_0.24.0   
##  [46] promises_1.2.0.1       BiocIO_1.8.0           scales_1.2.1          
##  [49] gtable_0.3.3           globals_0.16.3         goftest_1.2-3         
##  [52] spam_2.10-0            rlang_1.1.3            splines_4.2.3         
##  [55] rtracklayer_1.58.0     rstatix_0.7.2          lazyeval_0.2.2        
##  [58] spatstat.geom_3.2-9    broom_1.0.5            reshape2_1.4.4        
##  [61] yaml_2.3.7             abind_1.4-5            GenomicFeatures_1.50.4
##  [64] backports_1.4.1        httpuv_1.6.12          tools_4.2.3           
##  [67] ellipsis_0.3.2         jquerylib_0.1.4        RColorBrewer_1.1-3    
##  [70] plyr_1.8.8             Rcpp_1.0.10            progress_1.2.2        
##  [73] zlibbioc_1.44.0        purrr_1.0.1            RCurl_1.98-1.14       
##  [76] prettyunits_1.1.1      deldir_2.0-4           pbapply_1.7-2         
##  [79] cowplot_1.1.1          zoo_1.8-12             cluster_2.1.4         
##  [82] magrittr_2.0.3         scattermore_1.2        RSpectra_0.16-1       
##  [85] lmtest_0.9-40          RANN_2.6.1             fitdistrplus_1.1-11   
##  [88] aroma.light_3.28.0     patchwork_1.2.0        xtable_1.8-4          
##  [91] hms_1.1.3              mime_0.12              evaluate_0.21         
##  [94] XML_3.99-0.14          jpeg_0.1-10            gridExtra_2.3         
##  [97] fastDummies_1.7.3      compiler_4.2.3         biomaRt_2.54.1        
## [100] tibble_3.2.1           KernSmooth_2.23-20     crayon_1.5.2          
## [103] R.oo_1.26.0            htmltools_0.5.5        later_1.3.1           
## [106] tidyr_1.3.0            DBI_1.1.3              dbplyr_2.3.2          
## [109] MASS_7.3-58.2          rappdirs_0.3.3         Matrix_1.6-3          
## [112] car_3.1-2              cli_3.6.2              R.methodsS3_1.8.2     
## [115] parallel_4.2.3         dotCall64_1.1-1        igraph_2.0.3          
## [118] pkgconfig_2.0.3        spatstat.sparse_3.0-3  plotly_4.10.2         
## [121] xml2_1.3.4             annotate_1.76.0        bslib_0.5.0           
## [124] digest_0.6.33          sctransform_0.4.1      RcppAnnoy_0.0.22      
## [127] graph_1.76.0           spatstat.data_3.0-4    rmarkdown_2.23        
## [130] leiden_0.4.3.1         uwot_0.2.2             GSEABase_1.60.0       
## [133] restfulr_0.0.15        curl_5.0.1             shiny_1.7.5.1         
## [136] rjson_0.2.21           nlme_3.1-162           lifecycle_1.0.3       
## [139] jsonlite_1.8.8         carData_3.0-5          viridisLite_0.4.2     
## [142] fansi_1.0.4            pillar_1.9.0           lattice_0.20-45       
## [145] KEGGREST_1.38.0        fastmap_1.1.1          httr_1.4.6            
## [148] survival_3.5-3         glue_1.6.2             zip_2.3.0             
## [151] png_0.1-8              bit_4.0.5              presto_1.0.0          
## [154] stringi_1.7.12         sass_0.4.6             blob_1.2.4            
## [157] singscore_1.18.0       RcppHNSW_0.6.0         latticeExtra_0.6-30   
## [160] memoise_2.0.1          dplyr_1.1.2            irlba_2.3.5.1         
## [163] future.apply_1.11.2